# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Loading the COVID-19 CSV file (owid-covid-data.csv) into a dataframe (df)
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# the notebook only runs where this exact file location exists.
df = pd.read_csv(r'C:\Users\Lenovo\OneDrive\Y2S3\Introduction to Data Science\Assignment\Dataset\owid-covid-data.csv')
# Checking the datatypes for each column
df.dtypes
iso_code object continent object location object date object total_cases float64 new_cases float64 new_cases_smoothed float64 total_deaths float64 new_deaths float64 new_deaths_smoothed float64 total_cases_per_million float64 new_cases_per_million float64 new_cases_smoothed_per_million float64 total_deaths_per_million float64 new_deaths_per_million float64 new_deaths_smoothed_per_million float64 reproduction_rate float64 icu_patients float64 icu_patients_per_million float64 hosp_patients float64 hosp_patients_per_million float64 weekly_icu_admissions float64 weekly_icu_admissions_per_million float64 weekly_hosp_admissions float64 weekly_hosp_admissions_per_million float64 new_tests float64 total_tests float64 total_tests_per_thousand float64 new_tests_per_thousand float64 new_tests_smoothed float64 new_tests_smoothed_per_thousand float64 positive_rate float64 tests_per_case float64 tests_units object total_vaccinations float64 people_vaccinated float64 people_fully_vaccinated float64 new_vaccinations float64 new_vaccinations_smoothed float64 total_vaccinations_per_hundred float64 people_vaccinated_per_hundred float64 people_fully_vaccinated_per_hundred float64 new_vaccinations_smoothed_per_million float64 stringency_index float64 population float64 population_density float64 median_age float64 aged_65_older float64 aged_70_older float64 gdp_per_capita float64 extreme_poverty float64 cardiovasc_death_rate float64 diabetes_prevalence float64 female_smokers float64 male_smokers float64 handwashing_facilities float64 hospital_beds_per_thousand float64 life_expectancy float64 human_development_index float64 excess_mortality float64 dtype: object
# Checking the size/dimension of the dataset
df.shape
(96645, 60)
# Displaying the summary of each column
df.describe()
| total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | total_cases_per_million | new_cases_per_million | new_cases_smoothed_per_million | total_deaths_per_million | ... | extreme_poverty | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 9.338700e+04 | 93384.000000 | 92369.000000 | 8.332700e+04 | 83483.000000 | 92369.000000 | 92889.000000 | 92886.000000 | 91876.000000 | 82842.000000 | ... | 58663.000000 | 86900.000000 | 89064.000000 | 68010.000000 | 67015.000000 | 43663.000000 | 79206.000000 | 91773.000000 | 87077.000000 | 3407.000000 |
| mean | 1.013389e+06 | 6048.155294 | 6078.983893 | 2.770536e+04 | 147.156271 | 132.103750 | 12466.525473 | 76.148775 | 76.458143 | 278.009890 | ... | 13.392748 | 258.343287 | 7.915468 | 10.555165 | 32.683993 | 50.848538 | 3.028642 | 73.232493 | 0.727521 | 18.243505 |
| std | 6.966606e+06 | 37955.391475 | 37720.752239 | 1.651512e+05 | 805.017702 | 746.455626 | 23160.290191 | 199.635466 | 158.986921 | 506.105822 | ... | 19.955210 | 118.975576 | 4.129000 | 10.448992 | 13.481539 | 31.759905 | 2.459618 | 7.558983 | 0.150369 | 36.225191 |
| min | 1.000000e+00 | -74347.000000 | -6223.000000 | 1.000000e+00 | -1918.000000 | -232.143000 | 0.001000 | -2153.437000 | -276.825000 | 0.001000 | ... | 0.100000 | 79.370000 | 0.990000 | 0.100000 | 7.700000 | 1.188000 | 0.100000 | 53.280000 | 0.394000 | -95.590000 |
| 25% | 1.187500e+03 | 2.000000 | 7.429000 | 5.200000e+01 | 0.000000 | 0.000000 | 248.584000 | 0.207000 | 1.270000 | 7.863000 | ... | 0.600000 | 167.295000 | 5.310000 | 1.900000 | 21.600000 | 19.351000 | 1.300000 | 67.920000 | 0.602000 | 0.295000 |
| 50% | 1.297300e+04 | 73.000000 | 90.857000 | 3.640000e+02 | 2.000000 | 1.286000 | 1688.617000 | 8.219500 | 10.883500 | 49.320000 | ... | 2.200000 | 242.648000 | 7.110000 | 6.300000 | 31.400000 | 49.839000 | 2.400000 | 74.620000 | 0.748000 | 7.390000 |
| 75% | 1.371405e+05 | 801.000000 | 841.286000 | 3.573000e+03 | 18.000000 | 14.286000 | 12800.104000 | 70.419000 | 78.928000 | 301.538000 | ... | 21.200000 | 329.635000 | 10.080000 | 19.300000 | 41.100000 | 83.241000 | 3.861000 | 78.740000 | 0.848000 | 23.795000 |
| max | 1.777647e+08 | 906008.000000 | 826389.571000 | 3.849865e+06 | 18050.000000 | 14737.000000 | 179149.680000 | 18293.675000 | 4083.500000 | 5760.460000 | ... | 77.600000 | 724.417000 | 30.530000 | 44.000000 | 78.100000 | 100.000000 | 13.800000 | 86.750000 | 0.957000 | 409.690000 |
8 rows × 55 columns
# Displaying the top records of the dataset
df.head()
| iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | ... | extreme_poverty | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | excess_mortality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Asia | Afghanistan | 2020-02-24 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | ... | NaN | 597.029 | 9.59 | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN |
| 1 | AFG | Asia | Afghanistan | 2020-02-25 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | 597.029 | 9.59 | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN |
| 2 | AFG | Asia | Afghanistan | 2020-02-26 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | 597.029 | 9.59 | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN |
| 3 | AFG | Asia | Afghanistan | 2020-02-27 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | 597.029 | 9.59 | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN |
| 4 | AFG | Asia | Afghanistan | 2020-02-28 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | 597.029 | 9.59 | NaN | NaN | 37.746 | 0.5 | 64.83 | 0.511 | NaN |
5 rows × 60 columns
# New features
# Case fatality rate (CFR): deaths expressed as a percentage of confirmed cases.
# The ratio has to be deaths / cases; cases / deaths is not a fatality rate.
df['CFR'] = (df['total_deaths'] / df['total_cases']) * 100
df['CFR_per_million'] = (df['total_deaths_per_million'] / df['total_cases_per_million']) * 100
# Weighted score combining cases, hospital patients, ICU patients and deaths
# (weights 0.1 / 0.2 / 0.3 / 0.4). Any missing component makes the row NaN.
df['impact'] = (df['total_cases'] * 0.1 + df['hosp_patients'] * 0.2 + df['icu_patients'] * 0.3 + df['total_deaths'] * 0.4)
df['impact_per_million'] = (df['total_cases_per_million'] * 0.1 + df['hosp_patients_per_million'] * 0.2 + df['icu_patients_per_million'] * 0.3 + df['total_deaths_per_million'] * 0.4)
# Update graph font size
plt.rcParams.update({'font.size': 22})
# Function to plot a heat map
def heat_map(dataframe, columns=None):
    """Plot an annotated heatmap of correlations for selected columns.

    The pairwise correlation matrix of the numeric columns of ``dataframe``
    is computed, and only the columns named in ``columns`` are drawn (every
    numeric column still appears as a row).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to correlate. Must contain every column named in ``columns``.
    columns : sequence of str, optional
        Columns of the correlation matrix to display. Defaults to the
        vaccination variables plus ``CFR`` and ``CFR_per_million``.
    """
    if columns is None:
        columns = [
            'total_vaccinations',
            'people_vaccinated',
            'people_fully_vaccinated',
            'new_vaccinations',
            'new_vaccinations_smoothed',
            'total_vaccinations_per_hundred',
            'people_vaccinated_per_hundred',
            'people_fully_vaccinated_per_hundred',
            'new_vaccinations_smoothed_per_million',
            'CFR',
            'CFR_per_million',
        ]
    plt.figure(figsize=(20, 25))
    # Correlate numeric columns only: the frame also holds text columns
    # (e.g. iso_code, location) which cannot be correlated, and newer pandas
    # versions raise on them instead of silently dropping them.
    corr = dataframe.select_dtypes(include='number').corr()
    # list(...) so that a tuple of names is not treated as a single key.
    sns.heatmap(corr[list(columns)], cmap='Blues', annot=True)
# Function to plot scatter plot
def scatter_plot(dep, dataframe, ind=None):
    """Draw one scatter plot of ``dep`` against each independent variable.

    Each plot is titled with the x name, the y name and the correlation
    between the two columns.

    Parameters
    ----------
    dep : str
        Name of the column plotted on the y axis.
    dataframe : pandas.DataFrame
        Data containing ``dep`` and every column named in ``ind``.
    ind : sequence of str, optional
        Columns plotted on the x axis, one figure each. Defaults to the
        vaccination variables.
    """
    if ind is None:
        ind = [
            'total_vaccinations',
            'people_vaccinated',
            'people_fully_vaccinated',
            'new_vaccinations',
            'new_vaccinations_smoothed',
            'total_vaccinations_per_hundred',
            'people_vaccinated_per_hundred',
            'people_fully_vaccinated_per_hundred',
            'new_vaccinations_smoothed_per_million',
        ]
    plt.rcParams.update({'font.size': 22})
    # No explicit plt.figure() here: DataFrame.plot.scatter opens its own
    # figure, so an extra call only leaves an empty figure behind.
    for feature in ind:
        correlation = dataframe[feature].corr(dataframe[dep])
        dataframe.plot.scatter(
            x=feature,
            y=dep,
            c='DarkBlue',
            title="x: {x_name}; y: {y_name}; corr: {corr}".format(x_name=feature, y_name=dep, corr=correlation),
            figsize=(30, 45),
            fontsize=22,
        )
# Plotting heatmap
heat_map(df)
# Plotting scatter plots: vaccination variables against total_cases
scatter_plot('total_cases', df)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against total_cases_per_million
scatter_plot('total_cases_per_million', df)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against impact
# impact = total_cases * 0.1 + hosp_patients * 0.2 + icu_patients * 0.3 + total_deaths * 0.4
scatter_plot('impact', df)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against impact_per_million
# impact_per_million = total_cases_per_million * 0.1 + hosp_patients_per_million * 0.2 + icu_patients_per_million * 0.3 + total_deaths_per_million * 0.4
scatter_plot('impact_per_million', df)
<Figure size 432x288 with 0 Axes>
# Defining a new dataframe for Malaysia
# Take an explicit copy so the columns below are written to an independent
# frame rather than to a slice of df (this is what triggers
# SettingWithCopyWarning).
df_my = df[df['location'] == 'Malaysia'].copy()
# Case fatality rate: deaths as a percentage of confirmed cases (deaths / cases).
df_my['CFR'] = (df_my['total_deaths'] / df_my['total_cases']) * 100
df_my['CFR_per_million'] = (df_my['total_deaths_per_million'] / df_my['total_cases_per_million']) * 100
# Weighted score of cases, hospital patients, ICU patients and deaths.
df_my['impact'] = (df_my['total_cases'] * 0.1 + df_my['hosp_patients'] * 0.2 + df_my['icu_patients'] * 0.3 + df_my['total_deaths'] * 0.4)
df_my['impact_per_million'] = (df_my['total_cases_per_million'] * 0.1 + df_my['hosp_patients_per_million'] * 0.2 + df_my['icu_patients_per_million'] * 0.3 + df_my['total_deaths_per_million'] * 0.4)
<ipython-input-16-995a00b07d45>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_my['CFR'] = (df_my['total_cases'] / df_my['total_deaths'])*100 <ipython-input-16-995a00b07d45>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_my['CFR_per_million'] = (df_my['total_cases_per_million'] / df_my['total_deaths_per_million'])*100 <ipython-input-16-995a00b07d45>:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_my['impact'] = (df_my['total_cases'] * 0.1 + df_my['hosp_patients'] * 0.2 + df_my['icu_patients'] * 0.3 + df_my['total_deaths'] * 0.4) <ipython-input-16-995a00b07d45>:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_my['impact_per_million'] = (df_my['total_cases_per_million'] * 0.1 + df_my['hosp_patients_per_million'] * 0.2 + df_my['icu_patients_per_million'] * 0.3 + df_my['total_deaths_per_million'] * 0.4)
# Plotting heatmap
heat_map(df_my)
# Plotting scatter plots: vaccination variables against total_cases
scatter_plot('total_cases', df_my)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against total_cases_per_million
scatter_plot('total_cases_per_million', df_my)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against impact
scatter_plot('impact', df_my)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against impact_per_million
# impact_per_million = total_cases_per_million * 0.1 + hosp_patients_per_million * 0.2 + icu_patients_per_million * 0.3 + total_deaths_per_million * 0.4
scatter_plot('impact_per_million', df_my)
<Figure size 432x288 with 0 Axes>
# Defining a new dataframe for USA
# Take an explicit copy so the columns below are written to an independent
# frame rather than to a slice of df (this is what triggers
# SettingWithCopyWarning).
df_us = df[df['iso_code'] == 'USA'].copy()
# Case fatality rate: deaths as a percentage of confirmed cases (deaths / cases).
df_us['CFR'] = (df_us['total_deaths'] / df_us['total_cases']) * 100
df_us['CFR_per_million'] = (df_us['total_deaths_per_million'] / df_us['total_cases_per_million']) * 100
# Weighted score of cases, hospital patients, ICU patients and deaths.
df_us['impact'] = (df_us['total_cases'] * 0.1 + df_us['hosp_patients'] * 0.2 + df_us['icu_patients'] * 0.3 + df_us['total_deaths'] * 0.4)
df_us['impact_per_million'] = (df_us['total_cases_per_million'] * 0.1 + df_us['hosp_patients_per_million'] * 0.2 + df_us['icu_patients_per_million'] * 0.3 + df_us['total_deaths_per_million'] * 0.4)
<ipython-input-22-1a5ceb3b2ae5>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_us['CFR'] = (df_us['total_cases'] / df_us['total_deaths'])*100 <ipython-input-22-1a5ceb3b2ae5>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_us['CFR_per_million'] = (df_us['total_cases_per_million'] / df_us['total_deaths_per_million'])*100 <ipython-input-22-1a5ceb3b2ae5>:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_us['impact'] = (df_us['total_cases'] * 0.1 + df_us['hosp_patients'] * 0.2 + df_us['icu_patients'] * 0.3 + df_us['total_deaths'] * 0.4) <ipython-input-22-1a5ceb3b2ae5>:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_us['impact_per_million'] = (df_us['total_cases_per_million'] * 0.1 + df_us['hosp_patients_per_million'] * 0.2 + df_us['icu_patients_per_million'] * 0.3 + df_us['total_deaths_per_million'] * 0.4)
# Plotting heatmap
heat_map(df_us)
# Plotting scatter plots: vaccination variables against total_cases
scatter_plot('total_cases', df_us)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against total_cases_per_million
scatter_plot('total_cases_per_million', df_us)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against impact
# impact = total_cases * 0.1 + hosp_patients * 0.2 + icu_patients * 0.3 + total_deaths * 0.4
scatter_plot('impact', df_us)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against impact_per_million
# impact_per_million = total_cases_per_million * 0.1 + hosp_patients_per_million * 0.2 + icu_patients_per_million * 0.3 + total_deaths_per_million * 0.4
scatter_plot('impact_per_million', df_us)
<Figure size 432x288 with 0 Axes>
# Define new_impact feature
# new_impact = new_cases * 0.1 + hosp_patients * 0.2 + icu_patients * 0.3 + new_deaths * 0.4
# assign() returns a new frame, so the column is never written into a slice
# of another dataframe (avoids SettingWithCopyWarning).
df_us = df_us.assign(
    new_impact=(
        df_us['new_cases'] * 0.1
        + df_us['hosp_patients'] * 0.2
        + df_us['icu_patients'] * 0.3
        + df_us['new_deaths'] * 0.4
    )
)
<ipython-input-28-bfd8534c9706>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_us['new_impact'] = df_us['new_cases'] * 0.1 + df_us['hosp_patients'] * 0.2 + df_us['icu_patients'] * 0.3 + df_us['new_deaths'] * 0.4
# Plotting scatter plots: vaccination variables against new_cases
scatter_plot('new_cases', df_us)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against hosp_patients
scatter_plot('hosp_patients', df_us)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against icu_patients
scatter_plot('icu_patients', df_us)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against icu_patients
scatter_plot('icu_patients', df_us)
<Figure size 432x288 with 0 Axes>
# Plotting scatter plots: vaccination variables against new_deaths
# (the second argument must be the dataframe; `new_deaths` is a column name,
# not a variable)
scatter_plot('new_deaths', df_us)
# Plotting scatter plots: vaccination variables against new_impact
scatter_plot('new_impact', df_us)
<Figure size 432x288 with 0 Axes>